Rolling functions with missing values

Required packages

Needed to visualize and simplify showing the results.

library(ggplot2)
library(dplyr)
library(tidyr)
library(purrr)
library(patchwork)
library(bench)

Cumulative sum (cumsum())

This function returns the cumulative sum of the elements of a vector.

[[cpp11::register]] doubles cumsum2_cpp_(doubles x, bool na_rm = false) {
  int n = x.size();

  writable::doubles out(n);
  out[0] = x[0];

  if (na_rm == true) {
    for (int i = 1; i < n; ++i) {
      double y1 = out[i - 1], y2 = x[i];
      if (ISNAN(y2)) {
        out[i] = y1 + 0.0;
      } else {
        if (ISNAN(y1)) {
          out[i] = 0.0 + y2;
        } else {
          out[i] = y1 + y2;
        }
      }
    }
  } else {
    for (int i = 1; i < n; ++i) {
      double y1 = out[i - 1], y2 = x[i];
      if (ISNAN(y2)) {
        out[i] = NA_REAL;
      } else {
        if (ISNAN(y1)) {
          out[i] = NA_REAL;
        } else {
          out[i] = y1 + y2;
        }
      }
    }
  }

  return out;
}

I also need to add the corresponding auxiliary function for the documentation.

#' Return the cumulative sum of the coordinates of a vector (C++)
#' @param x numeric vector
#' @param na_rm logical. Should missing values (including `NaN`) be removed?
#' @export
cumsum2_cpp <- function(x, na_rm = FALSE) {
  cumsum2_cpp_(as.double(x), na_rm = na_rm)
}

A benchmark of the two functions is the following.

set.seed(123) # for reproducibility
x <- rpois(1e6, lambda = 2) # 1,000,000 elements

cumsum(c(1, NA, 2, 4))
[1]  1 NA NA NA
cumsum2_cpp(c(1, NA, 2, 4))
[1]  1 NA NA NA
cumsum2_cpp(c(1, NA, 2, 4), na_rm = TRUE)
[1] 1 1 3 7
mark(
  cumsum(x),
  cumsum2_cpp(x)
)
# A tibble: 2 × 6
  expression          min   median `itr/sec` mem_alloc `gc/sec`
  <bch:expr>     <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
1 cumsum(x)        1.83ms   1.89ms     513.     3.81MB     73.2
2 cumsum2_cpp(x)  30.93ms  35.35ms      28.3   15.26MB     33.0

Cumulative product (cumprod())

The C++ implementation is similar to the previous part.

[[cpp11::register]] doubles cumprod2_cpp_(doubles x, bool na_rm = false) {
  int n = x.size();

  writable::doubles out(n);
  out[0] = x[0];

  if (na_rm == true) {
    for (int i = 1; i < n; ++i) {
      double y1 = out[i - 1], y2 = x[i];
      if (ISNAN(y2)) {
        out[i] = y1 * 1.0;
      } else {
        if (ISNAN(y1)) {
          out[i] = 1.0 * y2;
        } else {
          out[i] = y1 * y2;
        }
      }
    }
  } else {
    for (int i = 1; i < n; ++i) {
      double y1 = out[i - 1], y2 = x[i];
      if (ISNAN(y2)) {
        out[i] = NA_REAL;
      } else {
        if (ISNAN(y1)) {
          out[i] = NA_REAL;
        } else {
          out[i] = y1 * y2;
        }
      }
    }
  }

  return out;
}

I need an auxiliary function to cast the input as double.

#' Return the cumulative product of the coordinates of a vector (C++)
#' @inheritParams cumsum_r
#' @export
cumprod2_cpp <- function(x, na_rm = FALSE) {
  cumprod2_cpp_(as.double(x), na_rm = na_rm)
}

Test correctness and Benchmark.

cumprod(c(1, NA, 2, 4))
[1]  1 NA NA NA
cumprod2_cpp(c(1, NA, 2, 4))
[1]  1 NA NA NA
cumprod2_cpp(c(1, NA, 2, 4), na_rm = TRUE)
[1] 1 1 2 8
mark(
  cumprod(x),
  cumprod2_cpp(x)
)
# A tibble: 2 × 6
  expression           min   median `itr/sec` mem_alloc `gc/sec`
  <bch:expr>      <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
1 cumprod(x)        1.88ms   2.08ms     339.     15.3MB    418. 
2 cumprod2_cpp(x)  30.61ms  31.59ms      31.2    15.3MB     18.7

Cumulative minimum (cummin())

The C++ implementation is similar to the previous part.

[[cpp11::register]] doubles cummin2_cpp_(doubles x, bool na_rm = false) {
  int n = x.size();

  writable::doubles out(n);
  out[0] = x[0];

  if (na_rm == true) {
    for (int i = 1; i < n; ++i) {
      double y1 = x[i - 1], y2 = x[i];
      if (ISNAN(y1)) {
        out[i] = y2;
      } else {
        out[i] = std::min(y1, y2);
      }
    }
  } else {
    for (int i = 1; i < n; ++i) {
      double y1 = out[i - 1], y2 = x[i];
      if (ISNAN(y2)) {
        out[i] = NA_REAL;
      } else {
        if (ISNAN(y1)) {
          out[i] = NA_REAL;
        } else {
          out[i] = std::min(y1, y2);
        }
      }
    }
  }

  return out;
}

I also have to add the corresponding auxiliary function for the documentation.

#' Return the cumulative minimum of the coordinates of a vector (C++)
#' @inheritParams cumsum_r
#' @export
cummin2_cpp <- function(x, na_rm = FALSE) {
  cummin2_cpp_(as.double(x), na_rm = na_rm)
}

Test correctness and benchmark.

cummin(c(1, NA, 2, 4))
[1]  1 NA NA NA
cummin2_cpp(c(1, NA, 2, 4))
[1]  1 NA NA NA
cummin2_cpp(c(1, NA, 2, 4), na_rm = TRUE)
[1] 1 1 1 1
mark(
  cummin(x),
  cummin2_cpp(x)
)
# A tibble: 2 × 6
  expression          min   median `itr/sec` mem_alloc `gc/sec`
  <bch:expr>     <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
1 cummin(x)        1.16ms   1.24ms     647.     3.81MB     67.9
2 cummin2_cpp(x)  32.16ms  34.32ms      29.2   15.26MB     43.8

Cumulative maximum (cummax())

The C++ implementation is similar to the previous part.

[[cpp11::register]] doubles cummax2_cpp_(doubles x, bool na_rm = false) {
  int n = x.size();

  writable::doubles out(n);
  out[0] = x[0];

  if (na_rm == true) {
    for (int i = 1; i < n; ++i) {
      double y1 = out[i - 1], y2 = x[i];
      if (ISNAN(y1)) {
        out[i] = y2;
      } else {
        out[i] = std::max(y1, y2);
      }
    }
  } else {
    for (int i = 1; i < n; ++i) {
      double y1 = out[i - 1], y2 = x[i];
      if (ISNAN(y2)) {
        out[i] = NA_REAL;
      } else {
        if (ISNAN(y1)) {
          out[i] = NA_REAL;
        } else {
          out[i] = std::max(y1, y2);
        }
      }
    }
  }

  return out;
}

I also have to add the corresponding auxiliary function for the documentation.

#' Return the cumulative maximum of the coordinates of a vector (C++)
#' @inheritParams cumsum_r
#' @export
cummax2_cpp <- function(x, na_rm = FALSE) {
  cummax2_cpp_(as.double(x), na_rm = na_rm)
}

Test correctness and benchmark.

cummax(c(1, NA, 2, 4))
[1]  1 NA NA NA
cummax2_cpp(c(1, NA, 2, 4))
[1]  1 NA NA NA
cummax2_cpp(c(1, NA, 2, 4), na_rm = TRUE)
[1] 1 1 2 4
mark(
  cummax(x),
  cummax2_cpp(x)
)
# A tibble: 2 × 6
  expression          min   median `itr/sec` mem_alloc `gc/sec`
  <bch:expr>     <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
1 cummax(x)        1.06ms   1.13ms     868.     3.81MB     59.0
2 cummax2_cpp(x)  32.27ms  32.62ms      30.6   15.26MB     11.1

References